---
title: "Replication data for Bureaucratic Responsiveness in Humanitarian Aid, Main Document"
output: html_notebook
---

This is the R code used for the data analyses presented in the article "Bureaucratic Responsiveness in Humanitarian Aid". 

The code here is used to render all tables and figures included in the main document. 

First, load the environment and main data set. 

```{r}
Sys.setenv(LANGUAGE = "en")


library(dplyr)
library(ggplot2)
library(fixest)
library(broom)
library(mice)
library(miceadds)
library(purrr)
library(tidyr)
library(margins)
library(interactions)


data <- read.csv(file = "maindata_fpa_replication.csv")
```

Log transform all numeric variables.  

```{r}
#first replace 0 values within affected_ofda with NA, as these values are missing data points. 

data$affected_ofda[data$affected_ofda == 0] <- NA


numeric_vars <- sapply(data, is.numeric)

for (var in names(data)[numeric_vars]) {
  temp <- data[[var]]
  
  temp[temp == 0] <- 0.1
  
  if (var == "lag_trade_bal") {
    data[[paste0("log_", var)]] <- sign(temp) * log(abs(temp))
  } else {
    temp[temp < 0 | is.na(temp)] <- NA
    data[[paste0("log_", var)]] <- log(temp)
  }
}


```

Center all logged independent and control variables. 

```{r}
vars_to_center <- c("log_med_h", "log_med_words", "log_ch_words", "log_affected_ofda", 
                    "log_lag_trade_bal", "log_lag_adj_gdp", "log_lag_pop")

data <- data %>%
  mutate(across(all_of(vars_to_center), ~ as.numeric(scale(. , center = TRUE, scale = FALSE)), .names = "centered_{.col}"))
```

Run the regression using the mice package to impute missing values for the variables of interest. Imputed values are determined by pooling the results of five imputations. 

Control variables are introduced stepwise. 

```{r}
data <- data %>% select(-affected_ofda, -lag_trade_bal,
                        -lag_adj_gdp, -lag_pop, -troops, 
                        -ch_words, -med_words, -med_h, -adj_ofda, -log_troops)

tempData <- mice(data, m=5, maxit=50, meth='pmm', seed=500, print = FALSE)
datlist <- mids2datlist(tempData)

mod1 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               case + factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod1, coef)
vars <- lapply(mod1, vcov)
sum1 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

mod2 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words+ 
               centered_log_med_h +
               centered_log_affected_ofda + 
               case + factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod2, coef)
vars <- lapply(mod2, vcov)
sum2 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)


mod3 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal +
               case + factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod3, coef)
vars <- lapply(mod3, vcov)
sum3 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)


mod4 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
               case + factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod4, coef)
vars <- lapply(mod4, vcov)
sum4 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)


mod5 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
               centered_log_lag_pop +
               case + factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod5, coef)
vars <- lapply(mod5, vcov)
sum5 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)

mod6 <- lapply(datlist, FUN = function(i){
  lm.cluster(i, formula =
               log_adj_ofda ~ centered_log_med_words + 
               centered_log_med_h +
               centered_log_affected_ofda + 
               centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
               centered_log_lag_pop +
               centered_log_ch_words +
               case + factor(year) +
               centered_log_med_h*centered_log_med_words, 
             cluster=i$case)
})
betas <- lapply(mod6, coef)
vars <- lapply(mod6, vcov)
sum6 <- summary(pool_mi(qhat = betas, u = vars), conf.int = TRUE)



```

Extract the coefficients, se, and p-values and compile them into one formatted table.
These results should match up with "Table 1. Estimated Coefficients for CE Aid Allocations"

```{r}


process_summary <- function(sum_table, model_name) {
  df <- data.frame(
    term = rownames(sum_table),           
    results = sum_table[, "results"],    
    se = sum_table[, "se"],       
    p = sum_table[, "p"],         
    model = model_name                    
  ) %>%
    filter(!grepl("case|year", term)) %>%
    mutate(
      # Append significance markers to results
      results = case_when(
        p < 0.01 ~ paste0(results, "***"),
        p < 0.05 ~ paste0(results, "**"),
        p < 0.101 ~ paste0(results, "*"),
        TRUE ~ as.character(results)
      )
    ) %>%
    select(-p)  # Remove p-values column

  return(df)
}

summary_tables <- list(sum1, sum2, sum3, sum4, sum5, sum6)
model_names <- c("Model 1", "Model 2", "Model 3", "Model 4", "Model 5", "Model 6")
combined_data <- map2_df(summary_tables, model_names, process_summary)

formatted_table <- combined_data %>%
  pivot_wider(names_from = model, values_from = c(results, se))

print(formatted_table)


```

Calculate the r-squared value for the pooled models.
These results should match up with "Table 1. Estimated Coefficients for CE Aid Allocations"


```{r}
calc_r_squared <- function(model_list, datlist, formula) {
  r2_values <- sapply(1:length(datlist), function(i) {
    model <- model_list[[i]]
    data <- datlist[[i]]

    # Ensure complete cases for all variables in the model
    complete_data <- model.frame(as.formula(formula), data = data, na.action = na.omit)

    # Extract response variable
    y <- complete_data$log_adj_ofda

    # Construct design matrix
    X <- model.matrix(as.formula(formula), data = complete_data)

    # Extract coefficients
    beta <- coef(model)

    # Ensure column names in X match the coefficient names in beta
    common_vars <- intersect(names(beta), colnames(X))
    X <- X[, common_vars, drop = FALSE]
    beta <- beta[common_vars]

    # Compute fitted values
    y_hat <- X %*% beta  

    # Compute R-squared
    ss_total <- sum((y - mean(y, na.rm = TRUE))^2)
    ss_residual <- sum((y - y_hat)^2)
    1 - (ss_residual / ss_total)
  })

  # Pooling R^2 using Rubin’s Rules
  m <- length(r2_values)
  r_bar <- mean(r2_values)
  b_var <- var(r2_values)
  w_var <- mean((r2_values - r_bar)^2)
  total_var <- ((m + 1) / m) * b_var - (1 / m) * w_var

  lower_ci <- r_bar - 1.96 * sqrt(total_var)
  upper_ci <- r_bar + 1.96 * sqrt(total_var)

  return(list(R_squared = r_bar, CI = c(lower_ci, upper_ci)))
}


models <- list(mod1, mod2, mod3, mod4, mod5, mod6)
formulas <- list(
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + case + factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    case + factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + 
    case + factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
    case + factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
    centered_log_lag_pop + case + factor(year) +
    centered_log_med_h * centered_log_med_words,
  
  log_adj_ofda ~ centered_log_med_words + 
    centered_log_med_h + centered_log_affected_ofda + 
    centered_log_lag_trade_bal + centered_log_lag_adj_gdp + 
    centered_log_lag_pop + centered_log_ch_words +
    case + factor(year) +
    centered_log_med_h * centered_log_med_words
)

r2_results <- lapply(seq_along(models), function(i) {
  cat("\nCalculating R² for Model", i, "...\n")
  calc_r_squared(models[[i]], datlist, formulas[[i]])
})

# Convert results into a data frame for easy viewing
r2_df <- data.frame(
  Model = paste0("mod", 1:6),
  R_squared = sapply(r2_results, function(x) x$R_squared),
  CI_lower = sapply(r2_results, function(x) x$CI[1]),
  CI_upper = sapply(r2_results, function(x) x$CI[2])
)

# Print results
print(r2_df)
```

Calculate the marginal effect of media volume on aid at different levels of media diversity. 

These results should match up with "Table 2. Marginal Effects of Media Volume on Aid at Different Levels of Media Diversity"

```{r}

#Mod5 will be used as the model of best fit
mod5 <- lapply(datlist, function(i) {
  lm(log_adj_ofda ~ centered_log_med_words + 
       centered_log_med_h + centered_log_affected_ofda +
       centered_log_lag_trade_bal + centered_log_lag_adj_gdp +
      centered_log_lag_pop + case + factor(year) + 
       centered_log_med_h * centered_log_med_words, data = i)
})

calculate_marginal_effect <- function(model, centered_log_med_h_level) {
  coef <- summary(model)$coefficients
  beta1 <- coef["centered_log_med_words", "Estimate"]
  beta3 <- coef["centered_log_med_words:centered_log_med_h", "Estimate"]
  
  marginal_effect <- beta1 + beta3 * centered_log_med_h_level
  return(marginal_effect)
}


marginal_effects_list <- lapply(mod5, function(model) {
  mean_log_med_h <- mean(model$model$centered_log_med_h)
  sd_log_med_h <- sd(model$model$centered_log_med_h)
  
  list(
    mean = calculate_marginal_effect(model, mean_log_med_h),
    mean_minus_sd = calculate_marginal_effect(model, mean_log_med_h - sd_log_med_h),
    mean_plus_sd = calculate_marginal_effect(model, mean_log_med_h + sd_log_med_h)
  )
})

extract_effects <- function(effect_list, level) {
  sapply(effect_list, function(e) e[[level]])
}

# Extract the marginal effects for each level
mean_effects <- extract_effects(marginal_effects_list, "mean")
mean_minus_sd_effects <- extract_effects(marginal_effects_list, "mean_minus_sd")
mean_plus_sd_effects <- extract_effects(marginal_effects_list, "mean_plus_sd")


# Calculate the average marginal effects across imputed datasets
average_mean_effect <- mean(mean_effects)
average_mean_minus_sd_effect <- mean(mean_minus_sd_effects)
average_mean_plus_sd_effect <- mean(mean_plus_sd_effects)

# Base effects for 1% change
base_effects <- c(average_mean_minus_sd_effect, average_mean_effect, average_mean_plus_sd_effect)

# Compute effects at different percentage changes
marginal_effects_df <- data.frame(
  Level = c("Mean - SD", "Mean", "Mean + SD"),
  Effect_1pct   = base_effects,
  Effect_10pct  = base_effects * 10,
  Effect_50pct  = base_effects * 50,
  Effect_100pct = base_effects * 100,
  Effect_250pct = base_effects * 250
)

# View the data frame
print(marginal_effects_df)


```

Visualize the interaction effect between media volume and media diversity. 

This should match the results shown in "Figure 1"
```{r}

#Use mod5 as the model of best fit
mod5_rep <- mod5[[1]]
mean_h <- mean(datlist[[1]]$centered_log_med_h, na.rm = TRUE)
sd_h <- sd(datlist[[1]]$centered_log_med_h, na.rm = TRUE)

interact_plot(mod5_rep, 
              pred = centered_log_med_words, 
              modx = centered_log_med_h, 
              modx.values = c(mean_h - 2*sd_h, mean_h, mean_h + 2*sd_h), 
              interval = FALSE, 
              modx.labels = c("-2 SD", "Mean", "+2 SD"),
              main.title = "Interaction Plot",
              x.label = "Log Media Volume (Centered)",
              y.label = "Log Adjusted OFDA",
              legend.main = "Log Media Diversity (Centered)")


```

